Summarize the reviews



In [1]:

    
# Read the raw review dump into a Spark DataFrame via the JSON reader.
# NOTE(review): `spark` is not defined in this notebook; presumably the kernel
# provides the SparkSession — confirm.
reviews_path = '../../data/raw_data/reviews_Home_and_Kitchen_5.json.gz'
all_reviews = spark.read.json(reviews_path)



In [3]:

    
from pyspark.sql.functions import col, expr, udf, trim
from pyspark.sql.types import IntegerType
import re

def _strip_punctuation(line):
    """Drop every character of `line` that is not an ASCII letter or whitespace.

    A missing summary (None) is passed through unchanged instead of raising a
    TypeError inside the Spark job; the resulting null row is then dropped by
    the non-empty filter below, because a comparison with null is not true.
    """
    if line is None:
        return None
    # Raw string: same pattern as before, without the invalid "\s" escape warning.
    return re.sub(r'[^A-Za-z\s]', '', line)


remove_punctuation = udf(_strip_punctuation)
# Ratings 1 and 2 become label 0; any other rating that reaches the UDF
# (only 5 survives the filter below) becomes label 1.
make_binary = udf(lambda rating: 0 if rating in [1, 2] else 1, IntegerType())

# Keep only clearly negative (1-2) and clearly positive (5) reviews, attach the
# binary label, and keep just the label plus the cleaned summary text.
# Summaries that are empty after cleaning are discarded.
reviews = (all_reviews
    .filter(col('overall').isin([1, 2, 5]))
    .withColumn('label', make_binary(col('overall')))
    .select(col('label').cast('int'), remove_punctuation('summary').alias('summary'))
    .filter(trim(col('summary')) != ''))

Splitting data and balancing skewness



In [4]:

    
# 80/20 train/test split with a fixed seed so the split is repeatable.
# The seed is written as a plain integer: the Python 2 only `L` suffix is a
# SyntaxError on Python 3 and denotes the same value.
train, test = reviews.randomSplit([.8, .2], seed=5436)



In [5]:

    
def multiply_dataset(dataset, n):
    """Return `dataset` unioned with itself until it appears `n` times.

    For `n` of 1 or less the dataset is returned as is. Otherwise one extra
    copy is appended with `union` for every unit by which `n` exceeds 1.
    """
    combined = dataset
    remaining = n
    while remaining > 1:
        combined = combined.union(dataset)
        remaining = remaining - 1
    return combined



In [6]:

    
# Split the training set by label so the minority (negative) class can be oversampled.
reviews_good = train.filter('label == 1')
reviews_bad = train.filter('label == 0')

# Number of whole copies of the negative class needed to roughly match the
# positive class. Floor division keeps this an integer on both Python 2 and
# Python 3 (a plain `/` would yield a float on Python 3).
oversampling_factor = reviews_good.count() // reviews_bad.count()
reviews_bad_multiplied = multiply_dataset(reviews_bad, oversampling_factor)

# Balanced training set: replicated negatives followed by all positives.
train_reviews = reviews_bad_multiplied.union(reviews_good)

Benchmark: predict by distribution



In [13]:

    
# Baseline: the share of positive-labelled rows in the (unbalanced) training
# split, i.e. the accuracy of a classifier that always answers "positive" on it.
good_count = reviews_good.count()
train_count = train.count()
accuracy = good_count / float(train_count)
print('Always predicting 5 stars accuracy: {0}'.format(accuracy))









    



Always predicting 5 stars accuracy: 0.87139780791

Learning pipeline



In [8]:

    
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression

# Each stage is named so the data flow reads top to bottom:
# summary -> words -> filtered_words -> rawFeatures -> features -> classifier.
tokenizer = Tokenizer(inputCol='summary', outputCol='words')
stop_words_remover = StopWordsRemover(inputCol='words', outputCol='filtered_words')
term_frequencies = HashingTF(
    inputCol='filtered_words', outputCol='rawFeatures', numFeatures=120000)
inverse_doc_frequencies = IDF(inputCol='rawFeatures', outputCol='features')
# No input columns are given, so the classifier relies on its default column names.
classifier = LogisticRegression(regParam=.3, elasticNetParam=.01)

pipeline = Pipeline(stages=[
    tokenizer,
    stop_words_remover,
    term_frequencies,
    inverse_doc_frequencies,
    classifier,
])

Testing the model on the held-out test set (area under the ROC curve)



In [9]:

    
# Fit all pipeline stages on the balanced training set (replicated negatives
# plus all positives).
model = pipeline.fit(train_reviews)



In [10]:

    
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out split with the fitted pipeline and evaluate it.
# No metric is configured, so the evaluator's default applies
# (areaUnderROC according to the PySpark documentation).
evaluator = BinaryClassificationEvaluator()
prediction = model.transform(test)
evaluator.evaluate(prediction)









    Out[10]:





0.9168045600888572

Using model to extract the most predictive words



In [11]:

    
from pyspark.sql.functions import explode
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType

# One row per token occurrence. The column is named 'summary' so that the
# fitted pipeline, which tokenizes the 'summary' column, can be applied to
# single words.
words = (tokenizer
    .transform(reviews)
    .select(explode(col('words')).alias('summary')))

# Score each distinct word exactly once. The prediction for a row depends only
# on its text, so scoring every occurrence would repeat the pipeline and the
# Python UDFs below for identical inputs.
predictors = (model
    .transform(words.distinct())
    .select(col('summary').alias('word'), 'probability'))

# Pull single entries out of the probability vector as plain Python floats.
# Index 0 is used as the negative-class probability and index 1 as the
# positive-class probability (see the aliases below).
first = udf(lambda x: x[0].item(), FloatType())
second = udf(lambda x: x[1].item(), FloatType())

# One row per word with its positive and negative probability. The max
# aggregation is kept so the result schema and semantics are unchanged.
predictive_words = (predictors
   .select(
       'word',
       second(col('probability')).alias('positive'),
       first(col('probability')).alias('negative'))
   .groupBy('word')
   .agg(
       F.max('positive').alias('positive'),
       F.max('negative').alias('negative')))

# Words ranked by how strongly they alone push the model towards "positive".
positive_predictive_words = (predictive_words
    .select(col('word').alias('positive_word'), col('positive').alias('pos_prob'))
    .sort('pos_prob', ascending=False))

# Words ranked by how strongly they alone push the model towards "negative".
negative_predictive_words = (predictive_words
    .select(col('word').alias('negative_word'), col('negative').alias('neg_prob'))
    .sort('neg_prob', ascending=False))



In [12]:

    
import pandas as pd
# Let pandas render all 100 rows of the comparison table.
pd.set_option('display.max_rows', 100)

# Collect the 100 strongest words of each polarity to the driver and show
# them side by side (columns concatenated, rows aligned by rank).
top_positive = positive_predictive_words.limit(100).toPandas()
top_negative = negative_predictive_words.limit(100).toPandas()
pd.concat([top_positive, top_negative], axis=1)









    Out[12]:







  
    
      
      positive_word
      pos_prob
      negative_word
      neg_prob
    
  
  
    
      0
      toxic
      0.702056
      worst
      0.693118
    
    
      1
      perfect
      0.702056
      za
      0.681185
    
    
      2
      excellent
      0.698975
      disappointed
      0.681185
    
    
      3
      awesome
      0.695059
      disappointing
      0.677256
    
    
      4
      fantastic
      0.690034
      disappointment
      0.669026
    
    
      5
      dalla
      0.689843
      terrible
      0.667437
    
    
      6
      amazing
      0.689843
      poor
      0.667185
    
    
      7
      wonderful
      0.687460
      useless
      0.660330
    
    
      8
      five
      0.683558
      bango
      0.660330
    
    
      9
      fabulous
      0.678689
      worthless
      0.658721
    
    
      10
      bailey
      0.677647
      gingergarlic
      0.658486
    
    
      11
      handy
      0.677647
      flimsy
      0.658486
    
    
      12
      blox
      0.677111
      grabber
      0.658486
    
    
      13
      regime
      0.677111
      returned
      0.658471
    
    
      14
      love
      0.677111
      poorly
      0.652594
    
    
      15
      best
      0.673670
      junk
      0.652316
    
    
      16
      great
      0.670816
      jarsgreat
      0.652202
    
    
      17
      expectedgot
      0.670816
      hamiliton
      0.652202
    
    
      18
      perfection
      0.667853
      defective
      0.652202
    
    
      19
      silex
      0.666016
      awful
      0.651954
    
    
      20
      loves
      0.666016
      infuse
      0.651954
    
    
      21
      needed
      0.664291
      coctails
      0.651954
    
    
      22
      perfectly
      0.663811
      meh
      0.651746
    
    
      23
      outstanding
      0.662990
      ok
      0.651042
    
    
      24
      wowloving
      0.662849
      microbopper
      0.651042
    
    
      25
      exactly
      0.662849
      broke
      0.648492
    
    
      26
      saves
      0.660796
      negive
      0.647956
    
    
      27
      terrific
      0.660496
      postal
      0.647956
    
    
      28
      classy
      0.659689
      horrible
      0.647956
    
    
      29
      beat
      0.659624
      cheaply
      0.644965
    
    
      30
      solved
      0.659568
      dangerous
      0.644139
    
    
      31
      simple
      0.658571
      breaks
      0.642654
    
    
      32
      finally
      0.658427
      eh
      0.642366
    
    
      33
      ruffled
      0.658241
      charges
      0.642366
    
    
      34
      beautiful
      0.658241
      okay
      0.642030
    
    
      35
      yay
      0.655354
      mediocre
      0.641426
    
    
      36
      easy
      0.655322
      flawed
      0.641148
    
    
      37
      fun
      0.654575
      weak
      0.639523
    
    
      38
      pleasantly
      0.652444
      managing
      0.639406
    
    
      39
      sooner
      0.649271
      lousy
      0.639406
    
    
      40
      versatile
      0.649171
      broken
      0.637405
    
    
      41
      sharpened
      0.649171
      beware
      0.636521
    
    
      42
      gift
      0.648284
      doesnt
      0.636136
    
    
      43
      gorgeous
      0.646563
      akrobins
      0.636136
    
    
      44
      mugthermos
      0.646563
      madewelding
      0.634321
    
    
      45
      wow
      0.643426
      rusted
      0.634321
    
    
      46
      casingborder
      0.643426
      uncomfortable
      0.634118
    
    
      47
      nice
      0.643066
      nightmare
      0.633125
    
    
      48
      ont
      0.642420
      shoddiest
      0.633116
    
    
      49
      solid
      0.642420
      cracked
      0.633116
    
    
      50
      favorite
      0.641204
      soso
      0.633033
    
    
      51
      elegant
      0.641035
      handheal
      0.633033
    
    
      52
      charm
      0.640674
      garbage
      0.632779
    
    
      53
      value
      0.640382
      overpriced
      0.631248
    
    
      54
      yummy
      0.639512
      fail
      0.630755
    
    
      55
      kiss
      0.639512
      died
      0.630731
    
    
      56
      penny
      0.638611
      short
      0.628559
    
    
      57
      cozy
      0.638148
      frustrating
      0.628401
    
    
      58
      superb
      0.636925
      rusts
      0.628361
    
    
      59
      affordable
      0.636830
      noisy
      0.628036
    
    
      60
      heaven
      0.636549
      lasted
      0.627848
    
    
      61
      exceeded
      0.635176
      stinks
      0.627412
    
    
      62
      saver
      0.634595
      pumped
      0.626616
    
    
      63
      comfy
      0.634482
      theory
      0.626616
    
    
      64
      accessorie
      0.634482
      dissapointed
      0.626493
    
    
      65
      tool
      0.634073
      bad
      0.626172
    
    
      66
      sturdy
      0.633408
      seniors
      0.626172
    
    
      67
      nonbasic
      0.633408
      helpers
      0.626172
    
    
      68
      lovely
      0.632788
      nori
      0.626172
    
    
      69
      kitchen
      0.632656
      cornerssee
      0.625675
    
    
      70
      canning
      0.631700
      awkward
      0.625675
    
    
      71
      highly
      0.629741
      mata
      0.625675
    
    
      72
      addition
      0.629598
      unreliable
      0.625524
    
    
      73
      measuring
      0.629445
      uneven
      0.625510
    
    
      74
      storage
      0.628716
      simpleyet
      0.624851
    
    
      75
      essential
      0.628671
      difficult
      0.624851
    
    
      76
      unique
      0.626989
      leaked
      0.623439
    
    
      77
      must
      0.626507
      worse
      0.623415
    
    
      78
      complaints
      0.626157
      windw
      0.623415
    
    
      79
      joining
      0.625883
      concept
      0.623266
    
    
      80
      cake
      0.625883
      waste
      0.622812
    
    
      81
      mom
      0.625141
      windowmounted
      0.622812
    
    
      82
      beats
      0.624665
      leaks
      0.622331
    
    
      83
      aquality
      0.624426
      leaky
      0.622287
    
    
      84
      organized
      0.624426
      misleading
      0.622105
    
    
      85
      husband
      0.624266
      fragile
      0.621708
    
    
      86
      helps
      0.624006
      disapointed
      0.621552
    
    
      87
      pleasure
      0.622396
      loveable
      0.621552
    
    
      88
      loving
      0.622124
      crap
      0.621500
    
    
      89
      convenient
      0.621260
      warped
      0.621298
    
    
      90
      works
      0.620345
      yuck
      0.620694
    
    
      91
      baking
      0.620343
      nystrip
      0.620396
    
    
      92
      fits
      0.619780
      rusty
      0.620396
    
    
      93
      windtunel
      0.619480
      rip
      0.620298
    
    
      94
      stylish
      0.619480
      inaccurate
      0.619848
    
    
      95
      circulonthis
      0.618929
      hate
      0.619539
    
    
      96
      workhorse
      0.618929
      asteroid
      0.619539
    
    
      97
      wife
      0.618862
      dud
      0.618871
    
    
      98
      delight
      0.618565
      crappy
      0.618867
    
    
      99
      emglish
      0.618565
      skip
      0.618234

	positive_word	pos_prob	negative_word	neg_prob
0	toxic	0.702056	worst	0.693118
1	perfect	0.702056	za	0.681185
2	excellent	0.698975	disappointed	0.681185
3	awesome	0.695059	disappointing	0.677256
4	fantastic	0.690034	disappointment	0.669026
5	dalla	0.689843	terrible	0.667437
6	amazing	0.689843	poor	0.667185
7	wonderful	0.687460	useless	0.660330
8	five	0.683558	bango	0.660330
9	fabulous	0.678689	worthless	0.658721
10	bailey	0.677647	gingergarlic	0.658486
11	handy	0.677647	flimsy	0.658486
12	blox	0.677111	grabber	0.658486
13	regime	0.677111	returned	0.658471
14	love	0.677111	poorly	0.652594
15	best	0.673670	junk	0.652316
16	great	0.670816	jarsgreat	0.652202
17	expectedgot	0.670816	hamiliton	0.652202
18	perfection	0.667853	defective	0.652202
19	silex	0.666016	awful	0.651954
20	loves	0.666016	infuse	0.651954
21	needed	0.664291	coctails	0.651954
22	perfectly	0.663811	meh	0.651746
23	outstanding	0.662990	ok	0.651042
24	wowloving	0.662849	microbopper	0.651042
25	exactly	0.662849	broke	0.648492
26	saves	0.660796	negive	0.647956
27	terrific	0.660496	postal	0.647956
28	classy	0.659689	horrible	0.647956
29	beat	0.659624	cheaply	0.644965
30	solved	0.659568	dangerous	0.644139
31	simple	0.658571	breaks	0.642654
32	finally	0.658427	eh	0.642366
33	ruffled	0.658241	charges	0.642366
34	beautiful	0.658241	okay	0.642030
35	yay	0.655354	mediocre	0.641426
36	easy	0.655322	flawed	0.641148
37	fun	0.654575	weak	0.639523
38	pleasantly	0.652444	managing	0.639406
39	sooner	0.649271	lousy	0.639406
40	versatile	0.649171	broken	0.637405
41	sharpened	0.649171	beware	0.636521
42	gift	0.648284	doesnt	0.636136
43	gorgeous	0.646563	akrobins	0.636136
44	mugthermos	0.646563	madewelding	0.634321
45	wow	0.643426	rusted	0.634321
46	casingborder	0.643426	uncomfortable	0.634118
47	nice	0.643066	nightmare	0.633125
48	ont	0.642420	shoddiest	0.633116
49	solid	0.642420	cracked	0.633116
50	favorite	0.641204	soso	0.633033
51	elegant	0.641035	handheal	0.633033
52	charm	0.640674	garbage	0.632779
53	value	0.640382	overpriced	0.631248
54	yummy	0.639512	fail	0.630755
55	kiss	0.639512	died	0.630731
56	penny	0.638611	short	0.628559
57	cozy	0.638148	frustrating	0.628401
58	superb	0.636925	rusts	0.628361
59	affordable	0.636830	noisy	0.628036
60	heaven	0.636549	lasted	0.627848
61	exceeded	0.635176	stinks	0.627412
62	saver	0.634595	pumped	0.626616
63	comfy	0.634482	theory	0.626616
64	accessorie	0.634482	dissapointed	0.626493
65	tool	0.634073	bad	0.626172
66	sturdy	0.633408	seniors	0.626172
67	nonbasic	0.633408	helpers	0.626172
68	lovely	0.632788	nori	0.626172
69	kitchen	0.632656	cornerssee	0.625675
70	canning	0.631700	awkward	0.625675
71	highly	0.629741	mata	0.625675
72	addition	0.629598	unreliable	0.625524
73	measuring	0.629445	uneven	0.625510
74	storage	0.628716	simpleyet	0.624851
75	essential	0.628671	difficult	0.624851
76	unique	0.626989	leaked	0.623439
77	must	0.626507	worse	0.623415
78	complaints	0.626157	windw	0.623415
79	joining	0.625883	concept	0.623266
80	cake	0.625883	waste	0.622812
81	mom	0.625141	windowmounted	0.622812
82	beats	0.624665	leaks	0.622331
83	aquality	0.624426	leaky	0.622287
84	organized	0.624426	misleading	0.622105
85	husband	0.624266	fragile	0.621708
86	helps	0.624006	disapointed	0.621552
87	pleasure	0.622396	loveable	0.621552
88	loving	0.622124	crap	0.621500
89	convenient	0.621260	warped	0.621298
90	works	0.620345	yuck	0.620694
91	baking	0.620343	nystrip	0.620396
92	fits	0.619780	rusty	0.620396
93	windtunel	0.619480	rip	0.620298
94	stylish	0.619480	inaccurate	0.619848
95	circulonthis	0.618929	hate	0.619539
96	workhorse	0.618929	asteroid	0.619539
97	wife	0.618862	dud	0.618871
98	delight	0.618565	crappy	0.618867
99	emglish	0.618565	skip	0.618234